Create a histogram that shows the distribution of characters in a string. Use this function to draw a bar chart for the letters in your first name.
library(ggplot2)
string = "RAJKANWAR"
char_count = data.frame(table(strsplit(string, "")[[1]]))
colnames(char_count) = c("char", "count")
ggplot(char_count, aes(x = char, y = count)) +
geom_bar(stat = "identity", aes(fill = char)) +
scale_fill_brewer(palette = "Set1") +
ggtitle("Distribution of characters in first name")
Dendogram on US Arrests
library(plotly)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(ggplot2)
library(ggdendro)
hc1 <- hclust(dist(USArrests), "ave")
p1 <- ggdendrogram(hc1, rotate = FALSE, size = 2)
ggplotly(p1)
NA
Draw the linked view for Cloud multivariate dataset (available in UCI repository).
# Load the library for data visualization
library(ggplot2)
# Load the cloud dataset from UCI repository
idata <- read.csv("cloud.csv", header = TRUE)
# Plot the linked view for the multivariate dataset using ggplot2
ggplot(data = idata, aes(x = Visible_mean, y = IR_mean, color = contrast)) +
geom_point() +
geom_smooth(method = "lm") +
xlab("Visible Mean") +
ylab("IR Mean") +
ggtitle("Linked View for Cloud Multivariate Dataset: Visible VS IR Mean")
Draw a graph matrix for Image Segmentation multivariate dataset (available in UCI repository).
data <- read.csv("segmentation.csv", header = TRUE)
# Next, we'll use k-means clustering to group the data into clusters
library(stats)
kmeans_result <- kmeans(data, centers = 5)
clusters <- kmeans_result$cluster
# We'll use the 'ggplot2' package to plot the graph matrix
library(ggplot2)
ggplot(data, aes(x = 1, y = 2, color = as.factor(clusters))) +
geom_point(size = 3) +
scale_color_discrete(name = "Cluster") +
xlab("") + ylab("") +
ggtitle("Graph Matrix using K-Means Clustering")
Draw exploratory graphics of a stock market Dataset.
library(ggplot2)
# Load the dataset
library(tidyquant)
Loading required package: lubridate
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
Loading required package: PerformanceAnalytics
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Attaching package: ‘PerformanceAnalytics’
The following object is masked from ‘package:graphics’:
legend
Loading required package: quantmod
Loading required package: TTR
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
data <- tq_get("AAPL", get = "stock.prices")
# Plot a scatter plot of the closing price and volume
ggplot(data, aes(x = close, y = volume)) +
geom_point(color = "red") +
ggtitle("Scatter Plot of Closing Price and Volume")
# Plot a line graph of the closing price over time
ggplot(data, aes(x = date, y = close)) +
geom_line(color = "blue") +
ggtitle("Closing Price of Apple Stock Over Time")
# Plot a histogram of the daily return
ggplot(data, aes(x = adjusted)) +
geom_histogram(binwidth = 1, fill = "green") +
ggtitle("Histogram of Daily Returns")
US Arrests K-means clustering
library(ggpubr)
Loading required package: ggplot2
library(ggplot2)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the USArrests dataset
data("USArrests")
df <- USArrests
# Compute k-means with k = 3
set.seed(123)
res.km <- kmeans(scale(df), 3, nstart = 25)
# K-means clusters showing the group of each state
res.km$cluster
Alabama Alaska Arizona Arkansas California Colorado Connecticut
1 1 1 3 1 1 3
Delaware Florida Georgia Hawaii Idaho Illinois Indiana
3 1 1 3 2 1 3
Iowa Kansas Kentucky Louisiana Maine Maryland Massachusetts
2 3 2 1 2 1 3
Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada
1 2 1 1 2 2 1
New Hampshire New Jersey New Mexico New York North Carolina North Dakota Ohio
2 3 1 1 1 2 3
Oklahoma Oregon Pennsylvania Rhode Island South Carolina South Dakota Tennessee
3 3 3 3 1 2 1
Texas Utah Vermont Virginia Washington West Virginia Wisconsin
1 3 2 3 3 2 2
Wyoming
3
fviz_cluster(res.km, data = df,
palette = c("#E69F00", "#56B4E9", "#009E73"),
geom = "point",
ellipse.type = "convex",
ggtheme = theme_bw()
)
# Dimension reduction using PCA
res.pca <- prcomp(df, scale = TRUE)
# Coordinates of states
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(res.km$cluster)
# Percentage of variance explained by dimensions
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
ggscatter(
ind.coord, x = "PC1", y = "PC2",
color = "cluster", palette = c("#E69F00", "#56B4E9", "#009E73"), ellipse = TRUE, ellipse.type = "convex",
size = 1.5, legend = "right", ggtheme = theme_bw(),
xlab = paste0("PC1 (", variance.percent[1], "% )" ),
ylab = paste0("PC2 (", variance.percent[2], "% )" )
) + stat_mean(aes(color = cluster), size = 4)
Error:
! Problem while computing aesthetics.
ℹ Error occurred in the 2nd layer.
Caused by error:
! object 'PC1' not found
Backtrace:
1. base (local) `<fn>`(x)
2. ggplot2:::print.ggplot(x)
4. ggplot2:::ggplot_build.ggplot(x)
5. ggplot2:::by_layer(...)
12. ggplot2 (local) f(l = layers[[i]], d = data[[i]])
13. l$compute_aesthetics(d, plot)
14. ggplot2 (local) compute_aesthetics(..., self = self)
15. ggplot2:::scales_add_defaults(...)
16. base::lapply(aesthetics[new_aesthetics], eval_tidy, data = data)
17. rlang (local) FUN(X[[i]], ...)
Visualize k-nearest-neighbor search, on Ranking in spatial dataset, using D3 quadtrees.
library(e1071)
library(caTools)
library(class)
library(ggplot2)
data(iris)
head(iris)
# Split data
split <- sample.split(iris, SplitRatio = 0.7)
train_cl <- subset(iris, split == "TRUE")
test_cl <- subset(iris, split == "FALSE")
# Feature Scaling
train_scale <- scale(train_cl[, 1:4])
test_scale <- scale(test_cl[, 1:4])
# Fitting KNN Model to training dataset
classifier_knn <- knn(train = train_scale,
test = test_scale,
cl = train_cl$Species,
k = 1)
classifier_knn
[1] setosa setosa setosa setosa setosa setosa setosa setosa setosa
[10] setosa setosa setosa setosa setosa setosa setosa setosa setosa
[19] setosa setosa versicolor virginica versicolor versicolor versicolor versicolor versicolor
[28] versicolor virginica versicolor virginica versicolor versicolor versicolor versicolor versicolor
[37] versicolor versicolor versicolor versicolor virginica virginica virginica virginica virginica
[46] virginica virginica versicolor virginica virginica virginica virginica virginica versicolor
[55] virginica virginica virginica virginica virginica virginica
Levels: setosa versicolor virginica
# Confusiin Matrix
cm <- table(test_cl$Species, classifier_knn)
cm
classifier_knn
setosa versicolor virginica
setosa 20 0 0
versicolor 0 17 3
virginica 0 2 18
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
[1] "Accuracy = 0.916666666666667"
# K = 3
classifier_knn <- knn(train = train_scale,
test = test_scale,
cl = train_cl$Species,
k = 3)
misClassError <- mean(classifier_knn != test_cl$Species)
# K = 5
classifier_knn <- knn(train = train_scale,
test = test_scale,
cl = train_cl$Species,
k = 5)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
[1] "Accuracy = 0.966666666666667"
# K = 7
classifier_knn <- knn(train = train_scale,
test = test_scale,
cl = train_cl$Species,
k = 7)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
[1] "Accuracy = 0.966666666666667"
# K = 15
classifier_knn <- knn(train = train_scale,
test = test_scale,
cl = train_cl$Species,
k = 15)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
[1] "Accuracy = 0.966666666666667"
# K = 19
classifier_knn <- knn(train = train_scale,
test = test_scale,
cl = train_cl$Species,
k = 19)
misClassError <- mean(classifier_knn != test_cl$Species)
print(paste('Accuracy =', 1-misClassError))
[1] "Accuracy = 0.966666666666667"
#Visiualisation
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
geom_point(size = 3) +
ggtitle("Iris Data Set") +
labs(x = "Petal Length", y = "Petal Width", color = "Species") +
theme_bw() +
geom_point(data = test_cl, aes(Petal.Length, Petal.Width, color = classifier_knn), size = 15, shape = 1)
Implement data visualization using dendrogram.
# Load data
data(mtcars)
# Compute distances and hierarchical clustering
dd <- dist(scale(mtcars), method = "euclidean")
#Ward_D2 means Instead of measuring the distance directly,
#it analyzes the variance of clusters
hc <- hclust(dd, method = "ward.D2")
library(factoextra)
Loading required package: ggplot2
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_dend(hc, cex = 0.5)
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
fviz_dend(hc, cex = 0.5,
main = "Dendrogram - ward.D2",
xlab = "Objects", ylab = "Distance", sub = "")
#fviz_dend(hc, cex = 0.5, horiz = TRUE)
fviz_dend(hc, k = 4, cex=0.5, k_colors = c("blue", "green", "red", "black"), # Cut in four groups
color_labels_by_k = TRUE, ggtheme = theme_gray() )
Visualize functional data with an application to eBay’s online auctions.
# Load required packages
library(fda)
library(fda.usc)
# Load Shill Bidding Dataset
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00562/Shill%20Bidding%20Dataset.csv"
shill <- read.csv(url, header = TRUE)
# Drop non-relevant variables
shill <- shill[, c("Auction_Bids", "Auction_Duration")]
# Sort the data by Auction_Bids
shill <- shill[order(shill$Auction_Bids),]
# Create a functional data object from the Auction_Duration variable using B-splines
basis <- create.bspline.basis(rangeval = range(shill$Auction_Bids), nbasis = 10)
fd_shill <- smooth.basis(shill$Auction_Bids, shill$Auction_Duration, basis)
# Plot the functional data object
plot(fd_shill, xlab = "Number of Bids", ylab = "Auction Duration",
main = "Relationship between Bids and Auction Duration in eBay Auctions",
col = "red", lwd = 2)
[1] "done"
Show graphical data representation in classification using Iris dataset
# Load required libraries
library(ggplot2)
library(datasets)
library(reshape2)
# Load iris dataset
data(iris)
# Define custom color palette
my_colors <- c("#E69F00", "#56B4E9", "#009E73")
# Scatterplot with regression line
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
geom_point(color = my_colors[1]) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = "Sepal Length", y = "Sepal Width", title = "Iris Dataset with Regression Line") +
theme_classic()
# Boxplot of Sepal length by Species
ggplot(iris, aes(Species, Sepal.Length)) +
geom_boxplot(fill = my_colors[2]) +
labs(x = "Species", y = "Sepal Length", title = "Iris Dataset: Sepal Length by Species") +
theme_classic()
# Histogram of Petal width by Species
ggplot(iris, aes(Petal.Width, fill = Species)) +
geom_histogram(alpha = 0.5, bins = 30) +
scale_fill_manual(values = my_colors) +
labs(x = "Petal Width", y = "Count", title = "Iris Dataset: Petal Width by Species") +
theme_classic()
# Stacked bar chart of Petal length by Species
ggplot(iris, aes(Species, Petal.Length, fill = Species)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = my_colors) +
labs(x = "Species", y = "Petal Length", title = "Iris Dataset: Petal Length by Species") +
theme_classic()
# Heatmap of correlations between variables in iris dataset
iris_cor <- round(cor(iris[,1:4]), 2) # Calculate correlation matrix
ggplot(data = melt(iris_cor), aes(Var2, Var1, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = my_colors[1], high = my_colors[3], mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1)) +
labs(x = "", y = "", title =
"Correlations between variables in Iris dataset")
Draw a graph matrix for mushroom dataset (available in UCI repository).
# Load the necessary packages
library(readr)
# Load the mushroom dataset
mushrooms <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
col_names = FALSE)
Rows: 8124 Columns: 23── Column specification ───────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (22): X1, X2, X3, X4, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, X...
lgl (1): X5
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
# Draw the graph matrix
ggpairs(mushrooms, aes(colour = mushrooms$X1))
Cluster a mtcars dataset using a k-means algorithm and visualize using Quadtree.
# Load the mtcars dataset
data(mtcars)
# Perform k-means clustering with 3 clusters
set.seed(123) # for reproducibility
k <- 3
fit <- kmeans(mtcars, k)
# Load the quadtree package
library(quadtree)
Error: package or namespace load failed for ‘quadtree’ in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]):
there is no package called ‘terra’